Getting names data

This data comes from Kaggle (kaggle.com), which has collected all recorded names for children born in the United States between 1880 and 2014.

## National baby names. The data was first imported from the CSV with:
## names <- read.csv('Baby-Name-Project/data/raw_data/NationalNames.csv')
## It was then saved as an RDS file and the CSV removed to save space,
## so the RDS copy is what gets loaded here.
## NOTE: this object shares its name with base R's names() function.
names <- readRDS(file = 'Baby-Name-Project/data/raw_data/all-names.rds')

## Baby names by state. The data was first imported from the CSV with:
## state <- read.csv('Baby-Name-Project/data/raw_data/StateNames.csv')
## It was then saved as an RDS file and the CSV removed; load the RDS copy.
state <- readRDS(file = 'Baby-Name-Project/data/raw_data/state-names.rds')

Limiting to boys with Nathan-related names (Nathan, Nate, Nathanial, Nathaniel, Nathanael)

## keep only male babies whose name is one of the Nathan-related spellings
## filter() comes from the dplyr package; %in% checks membership in the whole
## set of names at once instead of chaining one == comparison per name
dnn <- names %>%
  filter(
    Gender == 'M',
    Name %in% c('Nathan', 'Nate', 'Nathanial', 'Nathaniel', 'Nathanael')
  )

## state-level data: keep only male babies named exactly 'Nathan'
state_dnn <- state %>%
  filter(Gender == 'M', Name == 'Nathan')

## summing total number of Nathans for each state
## the grouping column is renamed to lowercase `state` while grouping, so the
## result has exactly two columns: `state` and `total`
## (this table is later passed to plot_usmap(), which looks for a `state` or
## `fips` column)
state_dnn_sum <- state_dnn %>%
  group_by(state = State) %>%
  summarize(total = sum(Count))

Creating static plot with annotations

## static line chart: yearly count for each Nathan-related name, one line per
## name, with the period around 1990 highlighted
p1 <- ggplot(dnn, aes(Year, Count, color = Name)) +
  geom_line(size = 1) +
  ## x-axis tick every 10 years starting at 1880
  scale_x_continuous(breaks = seq(1880, 2014, by = 10)) +
  ## shaded vertical band from 1989 to 1991, running from 0 to the top of the panel
  annotate(
    'rect',
    xmin = 1989, xmax = 1991, ymin = 0, ymax = Inf,
    fill = 'cadetblue3', alpha = 0.6
  ) +
  ## right-aligned text label positioned just left of the band
  annotate('text', x = 1988, y = 12500, label = '1990', size = 5, hjust = 'right') +
  labs(
    title = 'Baby boys with Nathan-related names',
    subtitle = 'United States, 1880-2014',
    caption = 'Source: www.kaggle.com',
    y = 'Number of babies'
  ) +
  theme(
    plot.title = element_text(size = 16, face = 'bold'),
    plot.subtitle = element_text(size = 13),
    plot.margin = unit(rep(1, 4), 'lines'),
    axis.text = element_text(size = 10, color = 'black'),
    ## margin() arguments are given in order: top, right, bottom, left
    axis.title.x = element_text(size = 12, face = 'bold', margin = margin(10, 0, 0, 0)),
    axis.title.y = element_text(size = 12, face = 'bold', margin = margin(0, 10, 0, 0)),
    ## legend placed inside the panel, boxed in white with a solid black border
    legend.position = c(0.15, 0.7),
    legend.title = element_text(size = 12, face = 'bold'),
    legend.text = element_text(size = 12),
    legend.background = element_rect(
      fill = 'white', color = 'black', linetype = 'solid', size = 0.5
    )
  )

## display the plot
p1

Interactive Plot

## base plot for the interactive version: a line per name plus a point for
## every plotted value
p2 <- ggplot(dnn, aes(Year, Count, color = Name)) +
  geom_line() +
  geom_point() +
  labs(y = 'Number of babies') +
  theme(plot.margin = unit(rep(1, 4), 'lines'))

## to make the plot interactive, view it through the ggplotly() function
ggplotly(p2)
## you should be able to hover the mouse over individual points to see count and year
## if you don't want the graph to show up inside the R Markdown file:
    ## click on the gear next to Knit at the top
    ## select 'Chunk Output in Console'

Mapping total number of Nathans by state

Making a heat map of the total number of Nathans born in each state

## US map with each state filled according to its `total` column in
## state_dnn_sum; the fill runs from blue (low) to red (high)
## NOTE(review): the subtitle states 1880-2014 -- confirm the state-level file
## actually covers that same range of years
p3 <- plot_usmap(data = state_dnn_sum, values = 'total') +
  scale_fill_continuous(
    name = 'Total',
    low = 'blue',
    high = 'red',
    labels = scales::comma  ## format legend numbers with thousands separators
  ) +
  labs(
    title = 'Total Number of Nathans By State',
    subtitle = 'US babies born 1880-2014',
    caption = 'Source: Kaggle.com'
  ) +
  theme(
    plot.title = element_text(size = 16, face = 'bold'),
    plot.subtitle = element_text(size = 13),
    plot.caption = element_text(size = 9),
    legend.position = 'right',
    legend.title = element_text(size = 11, face = 'bold'),
    legend.text = element_text(size = 9)
  )

## display the map
p3